In [6]:
import feats
import utils
import constants
import transactions

import os
import pickle
import operator
import numpy as np
import pandas as pd
import seaborn as sns
from imp import reload
from matplotlib import pyplot as plt
from statsmodels.tsa.api import VAR
from scipy.spatial.distance import euclidean
from sklearn.utils.extmath import cartesian
from sklearn.feature_extraction.text import CountVectorizer
from pandas.tools.plotting import lag_plot, autocorrelation_plot

In [3]:
uo = tle.get_users_orders('prior')

In [4]:
up_pair = uo[['user_id', 'product_id']].drop_duplicates()

In [ ]:
order_products_train = tle.get_orders_items('train')

In [33]:
order_products_prior = tle.get_orders_items('prior')
orders = tle.get_orders()
products = tle.get_items('products')
aisles = tle.get_items('aisles')
departments = tle.get_items('departments')

In [12]:
products_details = pd.merge(products, tle.craft_feat_product(), on = ['product_id'], how = 'right')

1 None Orders (orders with no reordered products)


In [6]:
order_is_None = order_products_train.groupby(['order_id'])['reordered'].sum().reset_index()

In [11]:
len(order_is_None[order_is_None.reordered == 0]) / len(order_is_None[order_is_None.reordered > 0])


Out[11]:
0.07015912631415824

In [19]:
a = pd.merge(order_is_None, orders, how = 'left', on = ['order_id'])

Prior and train orders


In [65]:
order_products_all = pd.concat([order_products_prior, order_products_train], axis = 0)

2 How many products do users buy each time

  • Number of products in each order

In [21]:
grouped = order_products_prior.groupby("order_id")["add_to_cart_order"].aggregate("max").reset_index()

In [22]:
grouped.add_to_cart_order.describe()


Out[22]:
count    3.214874e+06
mean     1.008888e+01
std      7.525398e+00
min      1.000000e+00
25%      5.000000e+00
50%      8.000000e+00
75%      1.400000e+01
max      1.450000e+02
Name: add_to_cart_order, dtype: float64

3 Do users purchase different numbers of products each time?

  • Does a user buy the same number of products each time?

In [23]:
grouped = pd.merge(grouped,
         orders,
         on = ['order_id'],
         how = 'left')[['user_id', 'add_to_cart_order', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']]

In [24]:
grouped = grouped.sort_values(['user_id', 'order_number'])

In [25]:
grouped.columns = ['user_id', 
                   'num_products', 
                   'order_number', 
                   'order_dow',
                   'order_hour_of_day', 
                   'days_since_prior_order']

In [18]:
user_num_product = grouped.groupby(['user_id'])['num_products'].agg({'mean':'mean', 'std':'std'})

In [ ]:
with open(DATA_DIR + 'user_num_product_stat.pkl', 'wb') as f:
    pickle.dump(user_num_product, f, pickle.HIGHEST_PROTOCOL)

In [3]:
with open(constants.FEAT_DATA_DIR + 'user_num_product_stat.pkl', 'rb') as f:
    user_num_product = pickle.load(f)

In [7]:
user_num_product['std'].describe()


Out[7]:
count    206209.000000
mean          4.266349
std           2.675061
min           0.000000
25%           2.345208
50%           3.781534
75%           5.609516
max          44.747439
Name: std, dtype: float64

4 Reorder Rate

  • Proportion of reordered products in each order

In [26]:
grouped = order_products_all.groupby("product_id")["reordered"].aggregate({'reorder_sum': sum,'reorder_total': 'count'}).reset_index()
grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']
grouped = pd.merge(grouped, products[['product_id', 'product_name']], how='left', on=['product_id'])
grouped = grouped[grouped.reorder_total > 75].sort_values(['reorder_probability'], ascending=False)[:10]

In [27]:
prior_reorder_rate = order_products_prior.groupby(['order_id'])['reordered'] \
                                         .aggregate({'reorder_pnum':'sum', 'pnum':'count'})

In [28]:
prior_reorder_rate['reorder_rate'] = prior_reorder_rate['reorder_pnum'] / prior_reorder_rate['pnum']

In [29]:
prior_reorder_rate.reset_index(inplace=True)

In [30]:
prior_orders = orders[orders.eval_set == 'prior']

In [31]:
prior_orders = pd.merge(prior_orders, prior_reorder_rate,
                        on = ['order_id'], how = 'left')

In [32]:
prior_orders.head(5)


Out[32]:
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order days_up_to_last pnum reorder_pnum reorder_rate
0 2539329 1 prior 1 2 8 0.0 190.0 5 0 0.000
1 2398795 1 prior 2 3 7 15.0 175.0 6 3 0.500
2 473747 1 prior 3 3 12 21.0 154.0 5 3 0.600
3 2254736 1 prior 4 4 7 29.0 125.0 5 5 1.000
4 431534 1 prior 5 4 15 28.0 97.0 8 5 0.625

In [33]:
user_reorder_est = prior_orders.groupby(['user_id'])['reorder_pnum']\
                               .aggregate({'reorder_pnum_mean':'mean', 
                                           'reorder_pnum_std':'std'}).reset_index()

In [34]:
user_reorder_est = user_reorder_est[['user_id', 'reorder_pnum_mean', 'reorder_pnum_std']]

In [35]:
with open(constants.FEAT_DATA_DIR + 'user_reorder_est.pkl', 'wb') as f:
    pickle.dump(user_reorder_est, f, pickle.HIGHEST_PROTOCOL)

In [3]:
with open(constants.FEAT_DATA_DIR + 'user_reorder_est.pkl', 'rb') as f:
    user_reorder_est = pickle.load(f)

In [10]:
user_reorder_est.reorder_pnum_std.describe()


Out[10]:
count    206209.000000
mean          3.018932
std           2.104826
min           0.000000
25%           1.511858
50%           2.563480
75%           4.029652
max          31.210495
Name: reorder_pnum_std, dtype: float64

5 Products User Bought Previously


In [ ]:
users_products = pd.merge(prior_orders, order_products_prior, on = ['order_id'], how = 'left')

In [20]:
users_products = users_products.groupby(['user_id'])['product_id'].apply(list).reset_index()

In [22]:
with open(DATA_DIR + 'user_product.pkl', 'wb') as f:
    pickle.dump(users_products, f, pickle.HIGHEST_PROTOCOL)

In [3]:
with open(constants.FEAT_DATA_DIR + 'user_product.pkl', 'rb') as f:
    users_products = pickle.load(f)

In [6]:
l = users_products.product_id.apply(len)

In [10]:
l.describe()


Out[10]:
count    206209.000000
mean        157.289396
std         204.208233
min           3.000000
25%          39.000000
50%          83.000000
75%         188.000000
max        3725.000000
Name: product_id, dtype: float64

6 Candidate Products

  • last purchase
  • reorder items
  • all items that have a high reorder rate
  • items that are added to cart first

In [394]:
grouped = order_products_all.groupby("product_id")["reordered"].aggregate({'reorder_sum': sum,'reorder_total': 'count'}).reset_index()
grouped['reorder_probability'] = grouped['reorder_sum'] / grouped['reorder_total']
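
The list above suggests combining several candidate sources; a minimal sketch of combining two of them, assuming the users_products frame from section 5 and the grouped frame just computed (the 0.5 probability cutoff is purely illustrative, not a tuned value):

In [ ]:
# products with a high reorder probability (illustrative threshold)
hot_products = set(grouped[grouped.reorder_probability > 0.5].product_id)

# per user: everything bought before, plus the globally "hot" reorder products
candidates = users_products.copy()
candidates['candidate_products'] = candidates.product_id.apply(
    lambda prods: set(prods) | hot_products)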

7 Time of orders


In [13]:
grouped = orders.order_hour_of_day.value_counts()
sns.set_style('darkgrid')
sns.barplot(grouped.index, grouped.values)
plt.show()



8 Topic Distance

  • user vs product: all (u, p) pairs in prior
  • latest order vs product: constructed via an LDA transform

In [15]:
# term-frequency matrix construct
orders = pd.read_csv(DATA_DIR + 'orders.csv')

users_orders = pd.merge(order_products_prior, orders[['user_id', 'order_id']], 
                        on = ['order_id'], how = 'left')

users_products_matrix = users_orders.groupby(['user_id'])['product_id'].apply(series_to_str)

tf = CountVectorizer(analyzer = 'word', lowercase = False, max_df=0.95, min_df=2,)
tf_matrix = tf.fit_transform(users_products_matrix.values)
tf_feature_names = tf.get_feature_names()

with open(DATA_DIR + 'tf.model', 'wb') as f:
    pickle.dump(tf, f, pickle.HIGHEST_PROTOCOL)
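
series_to_str is not defined in this notebook (it presumably lives in utils); for CountVectorizer it only needs to turn a Series of product_ids into one space-separated "document". A plausible sketch (an assumption about its behaviour):

In [ ]:
def series_to_str(s):
    # join the product_ids of one user (or one order) into a single
    # space-separated string, so CountVectorizer can tokenize it
    return ' '.join(str(pid) for pid in s)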

In [56]:
# Order topics: tf is the CountVectorizer fitted above, turning each order's products into a term-frequency vector
op = order_products_prior.groupby(['order_id'])['product_id'].apply(series_to_str)
topic_order = pd.DataFrame(lda.transform(tf.transform(op.values)), columns= ["topic_%d"%x for x in range(10)])
topic_order['order_id'] = op.index.values
with open(DATA_DIR + 'order_topic_norm.pkl', 'wb') as f:
    pickle.dump(topic_order, f, pickle.HIGHEST_PROTOCOL)  # LDA doc-topic output is already row-normalized

In [65]:
up_distance = pd.merge(users_orders[['user_id', 'product_id']].drop_duplicates(),
                       user_topic, 
                       on = ['user_id'],
                       how = 'left')
up_distance.columns = ['user_id', 'product_id'] + ["u_topic_%d"%x for x in range(10)] 
up_distance = pd.merge(up_distance,
                       topic_product, 
                       on = ['product_id'],
                       how = 'left')
up_distance.columns = ['user_id', 'product_id'] + ["u_topic_%d"%x for x in range(10)] + ["p_topic_%d"%x for x in range(10)]

In [87]:
def cal_up_distance(subf):
    '''Euclidean distance between a row's user topic vector and product topic vector.'''
    u_topic = subf[["u_topic_%d"%x for x in range(10)]]
    p_topic = subf[["p_topic_%d"%x for x in range(10)]]
    upd = euclidean(u_topic, p_topic)
    return upd

In [92]:
%%time
# row-wise apply over all (u, p) pairs: ~3 hours
up_distance['up_dis'] = up_distance.apply(cal_up_distance, axis = 1)


CPU times: user 3h 40min 56s, sys: 2min 4s, total: 3h 43min
Wall time: 3h 49min 41s
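
The row-wise apply above takes close to four hours; the same Euclidean distance can be computed on the underlying arrays in seconds. A vectorized sketch, assuming the u_topic_*/p_topic_* column layout built above:

In [ ]:
u_cols = ["u_topic_%d" % x for x in range(10)]
p_cols = ["p_topic_%d" % x for x in range(10)]

# element-wise difference of the two topic blocks, then the row-wise L2 norm
diff = up_distance[u_cols].values - up_distance[p_cols].values
up_distance['up_dis'] = np.sqrt((diff ** 2).sum(axis=1))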

In [94]:
up_distance = up_distance[['user_id', 'product_id', 'up_dis']]
with open(DATA_DIR + 'upd_feat.pkl', 'wb') as f:
    pickle.dump(up_distance, f, pickle.HIGHEST_PROTOCOL)

9 Order Topic Construct

  • CountVectorizer, LDA transform
  • build an order's topic representation from its products' topics
  • what about add-to-cart order? ignore ordering for now
  • per-user learning: add-to-cart order vs reorder? vs the next order's topics?

In [309]:
order_topic = pd.merge(order_products_prior[['order_id', 'product_id']],
                       topic_product,
                       on = ['product_id'],
                       how = 'inner')  # inner join drops products filtered out as vectorizer stop words

In [312]:
order_topic = order_topic.groupby(['order_id'])[["topic_%d"%x for x in range(10)]].sum().reset_index()

In [314]:
unorm = order_topic[["topic_%d"%x for x in range(10)]].values

In [315]:
order_topic[["topic_%d"%x for x in range(10)]] = unorm / unorm.sum(axis = 1)[:,np.newaxis]
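
A quick sanity check (a sketch, assuming every order kept by the inner join has at least one product with topic values, so no row sums to zero):

In [ ]:
# after normalization each order's topic vector should sum to 1
assert np.allclose(order_topic[["topic_%d" % x for x in range(10)]].sum(axis=1), 1.0)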

In [301]:
len(order_products_prior.product_id.unique())


Out[301]:
49677

In [302]:
len(topic_product.product_id.unique())


Out[302]:
49502

10 XGBoost Feature Preparation

  • Positive/negative sample ratio of 10:1

In [1]:
import constants, utils, transactions, feats
from imp import reload

In [3]:
tle = transactions.TransLogExtractor(constants.RAW_DATA_DIR, constants.FEAT_DATA_DIR)

In [33]:
train_none = feats.make_train_or_test_none(tle, 'train')


13307953

In [34]:
test_none = feats.make_train_or_test_none(tle, 'test')


13307953

In [4]:
train = feats.make_train_or_test(tle, 'train')


13307953
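
The 10:1 sample ratio noted above implies downsampling one class after the frame is built; a minimal sketch, assuming train has a binary label column named 'label' (an assumption, the real column name is defined inside feats) and reading the ratio as roughly ten negatives kept per positive (the direction isn't stated explicitly):

In [ ]:
# 'label' is an assumed column name for the reorder target
pos = train[train.label == 1]
neg = train[train.label == 0].sample(n=len(pos) * 10, random_state=42)
train_sampled = pd.concat([pos, neg], axis=0).sample(frac=1, random_state=42)  # shuffle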

In [ ]:
utils.check_inf_nan(train[up_cols])

In [ ]:
utils.check_inf_nan(train[ua_cols])

In [ ]:
utils.check_inf_nan(train[ud_cols])

In [7]:
utils.check_inf_nan(train[p_cols])


Checking inf ...
Series([], dtype: float64)
Checking NAN ...
Index([], dtype='object')
Out[7]:
True

In [ ]:
utils.check_inf_nan(train[a_cols])

In [ ]:
utils.check_inf_nan(train[d_cols])

In [ ]:
utils.check_inf_nan(train[ctx_cols])

In [ ]:
utils.check_inf_nan(train[topic_cols])

11 LSTM Feature Preparation

  • (u,p,t)
  • interval and add-to-cart order encoded as symbols

    • add-to-cart order

      • 1
      • 2
      • 3
      • 4-6
      • 7-11
      • 12+
    • interval

      • 1 - 7
      • 8 - 16
      • 17 - 33
      • 34+
      • 100 NaN
  • implementation

    • encode the two columns, 30 symbols in total
    • Cartesian-product lookup table
    • or use the raw numeric values directly

In [4]:
users_orders = tle.get_users_orders('prior')

In [ ]:
product_feat = tle.craft_feat_item('products')

In [255]:
user_feat = tle.craft_feat_user()

In [256]:
users_orders = pd.merge(users_orders, product_feat[['product_id', 'p_reorder_probability']], on=['product_id'], how='left')

In [257]:
users_orders = pd.merge(users_orders, user_feat[['user_id', 'u_total_reorders']], on=['user_id'], how='left')

In [258]:
def encode_numeric(row, bins):
    '''
    Convert a numeric value into a binned category.
    bins = [b1, b2, b3, b4] are ascending bin edges.
    '''
    index = ~(row < bins)       # True for every bin edge <= row
    # return the largest edge <= row, wrapped in a list so the symbols
    # can later be concatenated by simple column addition
    return [bins[index][-1]]

In [321]:
add2cart_bins = np.array([1, 2, 3, 4, 7, 12], dtype=float) # 6
interval_bins = np.array([-1, 4, 8, 17, 34], dtype=float)# 5
p_reorder_bins = np.array([0.0, 0.20, 0.38, 0.53], dtype=float)# 4
u_reorder_bins = np.array([0, 10, 33, 101], dtype=float)# 4
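
A quick check of how encode_numeric maps values onto these bin edges (a sketch; the expected outputs assume the bins above):

In [ ]:
print(encode_numeric(5, add2cart_bins))     # [4.0]  -> the 4-6 bucket
print(encode_numeric(20.0, interval_bins))  # [17.0] -> the 17-33 bucket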

In [ ]:
%%time
users_orders = users_orders.sort_values(['user_id', 'product_id', 'order_number'], ascending = False)
users_orders['up_interval'] = users_orders.groupby(['user_id', 'product_id'])['days_up_to_last'].diff()
users_orders.up_interval.fillna(-1, inplace=True)
users_orders['up_interval_sym'] = users_orders.up_interval.apply(lambda x: encode_numeric(x, interval_bins))
users_orders['up_add2cart_order_sym'] = users_orders.add_to_cart_order.apply(lambda x: encode_numeric(x, add2cart_bins))

In [265]:
users_orders['p_reorder_prob_sym'] = users_orders.p_reorder_probability.apply(lambda x: encode_numeric(x, p_reorder_bins))
users_orders['u_reorder_sym'] = users_orders.u_total_reorders.apply(lambda x:encode_numeric(x, u_reorder_bins))

In [322]:
feat_card = [add2cart_bins, interval_bins, p_reorder_bins, u_reorder_bins]

In [323]:
feat_cartesian = cartesian(feat_card)

In [327]:
# each *_sym column holds a one-element list, so "+" concatenates them into a 4-element key per row
users_orders['up_card'] = users_orders.up_add2cart_order_sym + users_orders.up_interval_sym + users_orders.p_reorder_prob_sym + users_orders.u_reorder_sym

In [337]:
def encode_cartesian(row, feat_cartesian):
    '''
    Lookup table: turn a group of categorical variables into a single symbol,
    the 1-based index of the matching row in feat_cartesian.
    '''
    sym = np.where(np.all(row == feat_cartesian, axis=1))[0][0] + 1
    return sym

In [340]:
%%time
users_orders['up_airr_sym'] = users_orders.up_card.apply(lambda x: encode_cartesian(x, feat_cartesian))


CPU times: user 5min 54s, sys: 6.16 s, total: 6min
Wall time: 5min 59s
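
The np.where call above scans the whole Cartesian table for every row, which explains the 6-minute runtime; building the lookup table once as a dict is a faster equivalent (a sketch, same symbols assumed):

In [ ]:
# symbol tuple -> 1-based row index in feat_cartesian, built once
sym_lookup = {tuple(row): i + 1 for i, row in enumerate(feat_cartesian)}

# same result as encode_cartesian, but a constant-time lookup per row
users_orders['up_airr_sym'] = users_orders.up_card.apply(lambda card: sym_lookup[tuple(card)])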

In [352]:
up_airr_sym = users_orders[['user_id', 'product_id', 'order_number', 'up_airr_sym']].copy()

In [354]:
up_airr_sym.sort_values(['user_id', 'product_id', 'order_number'], inplace=True)

In [356]:
up_airr_sym_list = up_airr_sym.groupby(['user_id', 'product_id'])['up_airr_sym'].apply(list).reset_index()

In [358]:
with open(constants.FEAT_DATA_DIR + 'up_airr_sym.pkl', 'wb') as f:
    pickle.dump(up_airr_sym_list, f, pickle.HIGHEST_PROTOCOL)

(u, p) pair purchase-interval prediction

A time series forecasting problem

  • Option 1: regress the current value on previous time steps
  • Option 2: an LSTM that uses only purchase-interval information

    • sample: (u, p, oid)
    • feature: the interval between two consecutive purchases
  • preprocessing

    • a (u, p) pair that appears only once has no interval (NaN); drop it
    • p_purchase_interval: time until the next purchase
    • drop zero intervals: two purchases on the same day count as one
    • for training, the interval sequence must have length >= 2, i.e. the (u, p) pair appears at least 3 times in prior

In [3]:
users_orders = tle.get_users_orders(prior_or_train='prior')

In [4]:
a = users_orders[['user_id', 'order_number', 'product_id', 'days_up_to_last', 'p_purchase_interval']].sort_values(['user_id', 'order_number', 'p_purchase_interval'])

In [5]:
del users_orders

In [10]:
a.sort_values(['user_id', 'product_id', 'order_number'], ascending=False, inplace=True)

In [11]:
%%time
# quick test on the first 1000 rows only; rows outside the head are left as NaN
a['up_interval'] = a.head(1000).groupby(['user_id', 'product_id'])['days_up_to_last'].diff()


CPU times: user 704 ms, sys: 136 ms, total: 840 ms
Wall time: 839 ms

In [13]:
a.sort_values(['user_id', 'product_id'])


Out[13]:
user_id order_number product_id days_up_to_last p_purchase_interval up_interval
24181266 1 10 196 14.0 -1.0 NaN
21760446 1 9 196 44.0 30.0 NaN
29474806 1 8 196 44.0 0.0 NaN
5212927 1 7 196 58.0 14.0 NaN
31927070 1 6 196 78.0 20.0 NaN
4089398 1 5 196 97.0 19.0 NaN
21376074 1 4 196 125.0 28.0 NaN
4488095 1 3 196 154.0 29.0 NaN
22742744 1 2 196 175.0 21.0 NaN
24076664 1 1 196 190.0 15.0 NaN
24181271 1 10 10258 14.0 -1.0 NaN
21760447 1 9 10258 44.0 30.0 NaN
29474807 1 8 10258 44.0 0.0 NaN
5212928 1 7 10258 58.0 14.0 NaN
31927072 1 6 10258 78.0 20.0 NaN
4089400 1 5 10258 97.0 19.0 NaN
21376076 1 4 10258 125.0 28.0 NaN
4488097 1 3 10258 154.0 29.0 NaN
22742745 1 2 10258 175.0 21.0 NaN
4089402 1 5 10326 97.0 -1.0 NaN
24181274 1 10 12427 14.0 -1.0 NaN
21760448 1 9 12427 44.0 30.0 NaN
29474805 1 8 12427 44.0 0.0 NaN
5212929 1 7 12427 58.0 14.0 NaN
31927071 1 6 12427 78.0 20.0 NaN
4089399 1 5 12427 97.0 19.0 NaN
21376075 1 4 12427 125.0 28.0 NaN
4488096 1 3 12427 154.0 29.0 NaN
22742746 1 2 12427 175.0 21.0 NaN
24076666 1 1 12427 190.0 15.0 NaN
... ... ... ... ... ... ...
5934984 206209 12 38167 37.0 -1.0 NaN
24260103 206209 9 38167 114.0 77.0 77.0
20186696 206209 8 38167 136.0 22.0 22.0
14617226 206209 3 38167 203.0 67.0 67.0
28236066 206209 13 38730 30.0 -1.0 NaN
28236064 206209 13 39216 30.0 -1.0 NaN
17584595 206209 11 40310 55.0 -1.0 NaN
14617216 206209 3 40396 203.0 -1.0 NaN
29908312 206209 1 40396 240.0 37.0 37.0
14617215 206209 3 40534 203.0 -1.0 NaN
29908314 206209 1 40534 240.0 37.0 37.0
20186704 206209 8 40992 136.0 -1.0 NaN
5219651 206209 7 40992 158.0 22.0 22.0
6521430 206209 4 40992 173.0 15.0 15.0
5934982 206209 12 41213 37.0 -1.0 NaN
17584591 206209 11 41213 55.0 18.0 18.0
21489865 206209 10 41213 85.0 30.0 30.0
6521424 206209 4 41213 173.0 88.0 88.0
14617213 206209 3 41213 203.0 30.0 30.0
17910756 206209 2 41213 233.0 30.0 30.0
29908311 206209 1 41213 240.0 7.0 7.0
6521428 206209 4 41665 173.0 -1.0 NaN
5934987 206209 12 43961 37.0 -1.0 NaN
20186697 206209 8 43961 136.0 99.0 99.0
6521435 206209 4 43961 173.0 37.0 37.0
5219655 206209 7 44325 158.0 -1.0 NaN
17584596 206209 11 48370 55.0 -1.0 NaN
5219653 206209 7 48697 158.0 -1.0 NaN
5934990 206209 12 48742 37.0 -1.0 NaN
5219652 206209 7 48742 158.0 121.0 121.0

32434489 rows × 6 columns


In [4]:
print("number of (u,p,t) tuples: %d"%len(users_orders))


number of (u,p,t) tuples: 32434489

In [ ]:
del users_orders # free memory usage

In [12]:
users_orders_intervals = users_orders.dropna()  # throw away (u, p) pairs bought only once (their interval is NaN)

In [16]:
users_orders_intervals = users_orders_intervals[users_orders_intervals.p_purchase_interval > 0]  # drop zero intervals: two purchases on the same day count as one

In [18]:
users_orders_intervals = users_orders_intervals.sort_values(['user_id', 'product_id', 'order_number'])

In [19]:
%%time
up_interval_list = users_orders_intervals.groupby(['user_id', 'product_id'])['p_purchase_interval'].apply(list).reset_index()


CPU times: user 3min 48s, sys: 12.5 s, total: 4min
Wall time: 4min 1s

In [20]:
len(up_interval_list)


Out[20]:
5279850

In [22]:
del users_orders_intervals # free memory usage

In [24]:
up_interval_list['len'] = up_interval_list.p_purchase_interval.apply(len)

In [25]:
up_interval_list = up_interval_list[up_interval_list.len >= 2] # for train/test split

In [ ]:
with open(constants.FEAT_DATA_DIR + 'up_interval_feat.pkl', 'wb') as f:
    pickle.dump(up_interval_list, f, pickle.HIGHEST_PROTOCOL)

In [ ]:
len(up_interval_list)

In [ ]:
up_interval_list.len.describe()
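
With the length >= 2 filter in place, each (u, p) interval list can be split into an input sequence and a regression target for the LSTM idea above; a minimal sketch (assumption: the last observed interval is used as the target):

In [ ]:
X_seqs = up_interval_list.p_purchase_interval.apply(lambda seq: seq[:-1])
y = up_interval_list.p_purchase_interval.apply(lambda seq: seq[-1])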